home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Language/OS - Multiplatform Resource Library
/
LANGUAGE OS.iso
/
a_utils
/
_archvrs
/
unix
/
arc521.lha
/
arc
/
arcsq.c
< prev
next >
Wrap
C/C++ Source or Header
|
1989-08-08
|
15KB
|
549 lines
/*
* $Header: arcsq.c,v 1.3 88/07/31 18:53:32 hyc Exp $
*/
/*
* ARC - Archive utility - ARCSQ
*
* Version 3.10, created on 01/30/86 at 20:10:46
*
* (C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED
*
* By: Thom Henderson
*
* Description: This file contains the routines used to squeeze a file when
* placing it in an archive.
*
* Language: Computer Innovations Optimizing C86
*
* Programming notes: Most of the routines used for the Huffman squeezing
* algorithm were lifted from the SQ program by Dick Greenlaw, as adapted to
* CI-C86 by Robert J. Beilstein.
*/
#include <stdio.h>
#include "arc.h"
/* stuff for Huffman squeezing */
#define TRUE 1
#define FALSE 0
#define ERROR (-1)
#define SPEOF 256 /* special endfile token */
#define NOCHILD (-1) /* marks end of path through tree */
#define NUMVALS 257 /* 256 data values plus SPEOF */
#define NUMNODES (NUMVALS+NUMVALS-1) /* number of nodes */
#define MAXCOUNT (unsigned short) 65535 /* biggest unsigned integer */
/*
* The following array of structures are the nodes of the binary trees. The
* first NUMVALS nodes become the leaves of the final tree and represent the
* values of the data bytes being encoded and the special endfile, SPEOF. The
* remaining nodes become the internal nodes of the final tree.
*/
struct nd { /* shared by unsqueezer */
unsigned short weight; /* number of appearances */
short tdepth; /* length on longest path in tree */
short lchild, rchild; /* indices to next level */
} node[NUMNODES]; /* use large buffer */
static int dctreehd; /* index to head of final tree */
/*
* This is the encoding table: The bit strings have first bit in low bit.
* Note that counts were scaled so code fits unsigned integer.
*/
static int codelen[NUMVALS]; /* number of bits in code */
static unsigned short code[NUMVALS]; /* code itself, right adjusted */
static unsigned short tcode; /* temporary code value */
static long valcount[NUMVALS]; /* actual count of times seen */
/* Variables used by encoding process */
static int curin; /* value currently being encoded */
static int cbitsrem; /* # of code string bits left */
static unsigned short ccode; /* current code right justified */
static void scale(), heap(), adjust(), bld_tree(), init_enc(), put_int();
static int cmptrees(), buildenc(), maxchar();
void
init_sq()
{ /* prepare for scanning pass */
int i; /* node index */
/*
* Initialize all nodes to single element binary trees with zero
* weight and depth.
*/
for (i = 0; i < NUMNODES; ++i) {
node[i].weight = 0;
node[i].tdepth = 0;
node[i].lchild = NOCHILD;
node[i].rchild = NOCHILD;
}
for (i = 0; i < NUMVALS; i++)
valcount[i] = 0;
}
void
scan_sq(c) /* add a byte to the tables */
int c; /* byte to add */
{
unsigned short *wp; /* speeds up weight counting */
/* Build frequency info in tree */
if (c == EOF) /* it's traditional */
c = SPEOF; /* dumb, but traditional */
if (*(wp = &node[c].weight) != MAXCOUNT)
++(*wp); /* bump weight counter */
valcount[c]++; /* bump byte counter */
}
long
pred_sq()
{ /* predict size of squeezed file */
int i;
int btlist[NUMVALS]; /* list of intermediate
* b-trees */
int listlen;/* length of btlist */
unsigned short ceiling;/* limit for scaling */
long size = 0; /* predicted size */
int numnodes; /* # of nodes in simplified tree */
scan_sq(EOF); /* signal end of input */
ceiling = MAXCOUNT;
/* Keep trying to scale and encode */
do {
scale(ceiling);
ceiling /= 2; /* in case we rescale */
/*
* Build list of single node binary trees having leaves for
* the input values with non-zero counts
*/
for (i = listlen = 0; i < NUMVALS; ++i) {
if (node[i].weight != 0) {
node[i].tdepth = 0;
btlist[listlen++] = i;
}
}
/*
* Arrange list of trees into a heap with the entry indexing
* the node with the least weight at the top.
*/
heap(btlist, listlen);
/* Convert the list of trees to a single decoding tree */
bld_tree(btlist, listlen);
/* Initialize the encoding table */
init_enc();
/*
* Try to build encoding table. Fail if any code is > 16 bits
* long.
*/
} while (buildenc(0, dctreehd) == ERROR);
/* Initialize encoding variables */
cbitsrem = 0; /* force initial read */
curin = 0; /* anything but endfile */
for (i = 0; i < NUMVALS; i++) /* add bits for each code */
size += valcount[i] * codelen[i];
size = (size + 7) / 8; /* reduce to number of bytes */
numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS - 1);
size += sizeof(short) + 2 * numnodes * sizeof(short);
return size;
}
/*
* The count of number of occurrances of each input value have already been
* prevented from exceeding MAXCOUNT. Now we must scale them so that their
* sum doesn't exceed ceiling and yet no non-zero count can become zero. This
* scaling prevents errors in the weights of the interior nodes of the
* Huffman tree and also ensures that the codes will fit in an unsigned
* integer. Rescaling is used if necessary to limit the code length.
*/
static void
scale(ceil)
unsigned short ceil; /* upper limit on total weight */
{
register int i;
int ovflw, divisor;
unsigned short w, sum;
unsigned char increased; /* flag */
do {
for (i = sum = ovflw = 0; i < NUMVALS; ++i) {
if (node[i].weight > (ceil - sum))
++ovflw;
sum += node[i].weight;
}
divisor = ovflw + 1;
/* Ensure no non-zero values are lost */
increased = FALSE;
for (i = 0; i < NUMVALS; ++i) {
w = node[i].weight;
if (w < divisor && w != 0) { /* Don't fail to provide
* a code if it's used
* at all */
node[i].weight = divisor;
increased = TRUE;
}
}
} while (increased);
/* Scaling factor chosen, now scale */
if (divisor > 1)
for (i = 0; i < NUMVALS; ++i)
node[i].weight /= divisor;
}
/*
* heap() and adjust() maintain a list of binary trees as a heap with the top
* indexing the binary tree on the list which has the least weight or, in
* case of equal weights, least depth in its longest path. The depth part is
* not strictly necessary, but tends to avoid long codes which might provoke
* rescaling.
*/
static void
heap(list, length)
int list[], length;
{
register int i;
for (i = (length - 2) / 2; i >= 0; --i)
adjust(list, i, length - 1);
}
/* Make a heap from a heap with a new top */
static void
adjust(list, top, bottom)
int list[], top, bottom;
{
register int k, temp;
k = 2 * top + 1; /* left child of top */
temp = list[top]; /* remember root node of top tree */
if (k <= bottom) {
if (k < bottom && cmptrees(list[k], list[k + 1]))
++k;
/* k indexes "smaller" child (in heap of trees) of top */
/* now make top index "smaller" of old top and smallest child */
if (cmptrees(temp, list[k])) {
list[top] = list[k];
list[k] = temp;
/* Make the changed list a heap */
adjust(list, k, bottom); /* recursive */
}
}
}
/*
* Compare two trees, if a > b return true, else return false. Note
* comparison rules in previous comments.
*/
static int
cmptrees(a, b)
int a, b; /* root nodes of trees */
{
if (node[a].weight > node[b].weight)
return TRUE;
if (node[a].weight == node[b].weight)
if (node[a].tdepth > node[b].tdepth)
return TRUE;
return FALSE;
}
/*
* HUFFMAN ALGORITHM: develops the single element trees into a single binary
* tree by forming subtrees rooted in interior nodes having weights equal to
* the sum of weights of all their descendents and having depth counts
* indicating the depth of their longest paths.
*
* When all trees have been formed into a single tree satisfying the heap
* property (on weight, with depth as a tie breaker) then the binary code
* assigned to a leaf (value to be encoded) is then the series of left (0)
* and right (1) paths leading from the root to the leaf. Note that trees are
* removed from the heaped list by moving the last element over the top
* element and reheaping the shorter list.
*/
static void
bld_tree(list, len)
int list[];
int len;
{
register int freenode; /* next free node in tree */
register struct nd *frnp; /* free node pointer */
int lch, rch; /* temps for left, right children */
/*
* Initialize index to next available (non-leaf) node. Lower numbered
* nodes correspond to leaves (data values).
*/
freenode = NUMVALS;
while (len > 1) { /* Take from list two btrees with least
* weight and build an interior node pointing
* to them. This forms a new tree. */
lch = list[0]; /* This one will be left child */
/* delete top (least) tree from the list of trees */
list[0] = list[--len];
adjust(list, 0, len - 1);
/* Take new top (least) tree. Reuse list slot later */
rch = list[0]; /* This one will be right child */
/*
* Form new tree from the two least trees using a free node
* as root. Put the new tree in the list.
*/
frnp = &node[freenode]; /* address of next free node */
list[0] = freenode++; /* put at top for now */
frnp->lchild = lch;
frnp->rchild = rch;
frnp->weight = node[lch].weight + node[rch].weight;
frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth);
/* reheap list to get least tree at top */
adjust(list, 0, len - 1);
}
dctreehd = list[0]; /* head of final tree */
}
static int
maxchar(a, b)
{
return a > b ? a : b;
}
static void
init_enc()
{
register int i;
/* Initialize encoding table */
for (i = 0; i < NUMVALS; ++i)
codelen[i] = 0;
}
/*
* Recursive routine to walk the indicated subtree and level and maintain the
* current path code in bstree. When a leaf is found the entire code string
* and length are put into the encoding table entry for the leaf's data value
* .
*
* Returns ERROR if codes are too long.
*/
static int
buildenc(level, root)
int level; /* level of tree being examined, from zero */
int root; /* root of subtree is also data value if leaf */
{
register int l, r;
l = node[root].lchild;
r = node[root].rchild;
if (l == NOCHILD && r == NOCHILD) { /* Leaf. Previous path
* determines bit string code
* of length level (bits 0 to
* level - 1). Ensures unused
* code bits are zero. */
codelen[root] = level;
code[root] = tcode & (((unsigned short ) ~0) >> (16 - level));
return (level > 16) ? ERROR : 0;
} else {
if (l != NOCHILD) { /* Clear path bit and continue deeper */
tcode &= ~(1 << level);
if (buildenc(level + 1, l) == ERROR)
return ERROR; /* pass back bad statuses */
}
if (r != NOCHILD) { /* Set path bit and continue deeper */
tcode |= 1 << level;
if (buildenc(level + 1, r) == ERROR)
return ERROR; /* pass back bad statuses */
}
}
return NULL; /* it worked if we reach here */
}
static void
put_int(n, f) /* output an integer */
short n; /* integer to output */
FILE *f; /* file to put it to */
{
void putc_pak();
putc_pak(n & 0xff, f); /* first the low byte */
putc_pak(n >> 8, f); /* then the high byte */
}
/* Write out the header of the compressed file */
static long
wrt_head(ob)
FILE *ob;
{
register int l, r;
int i, k;
int numnodes; /* # of nodes in simplified tree */
/*
* Write out a simplified decoding tree. Only the interior nodes are
* written. When a child is a leaf index (representing a data value)
* it is recoded as -(index + 1) to distinguish it from interior
* indexes which are recoded as positive indexes in the new tree.
*
* Note that this tree will be empty for an empty file.
*/
numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS - 1);
put_int(numnodes, ob);
for (k = 0, i = dctreehd; k < numnodes; ++k, --i) {
l = node[i].lchild;
r = node[i].rchild;
l = l < NUMVALS ? -(l + 1) : dctreehd - l;
r = r < NUMVALS ? -(r + 1) : dctreehd - r;
put_int(l, ob);
put_int(r, ob);
}
return sizeof(short) + numnodes * 2 * sizeof(short);
}
/*
* Get an encoded byte or EOF. Reads from specified stream AS NEEDED.
*
* There are two unsynchronized bit-byte relationships here. The input stream
* bytes are converted to bit strings of various lengths via the static
* variables named c... These bit strings are concatenated without padding to
* become the stream of encoded result bytes, which this function returns one
* at a time. The EOF (end of file) is converted to SPEOF for convenience and
* encoded like any other input value. True EOF is returned after that.
*/
static int
gethuff(ib) /* Returns bytes except for EOF */
FILE *ib;
{
int rbyte; /* Result byte value */
int need; /* number of bits */
int getc_ncr();
rbyte = 0;
need = 8; /* build one byte per call */
/*
* Loop to build a byte of encoded data. Initialization forces read
* the first time.
*/
loop:
if (cbitsrem >= need) { /* if current code is big enough */
if (need == 0)
return rbyte;
rbyte |= ccode << (8 - need); /* take what we need */
ccode >>= need; /* and leave the rest */
cbitsrem -= need;
return rbyte & 0xff;
}
/* We need more than current code */
if (cbitsrem > 0) {
rbyte |= ccode << (8 - need); /* take what there is */
need -= cbitsrem;
}
/* No more bits in current code string */
if (curin == SPEOF) { /* The end of file token has been encoded. If
* result byte has data return it and do EOF
* next time. */
cbitsrem = 0;
return (need == 8) ? EOF : rbyte + 0;
}
/* Get an input byte */
if ((curin = getc_ncr(ib)) == EOF)
curin = SPEOF; /* convenient for encoding */
ccode = code[curin]; /* get the new byte's code */
cbitsrem = codelen[curin];
goto loop;
}
/*
* This routine is used to perform the actual squeeze operation. It can only
* be called after the file has been scanned. It returns the true length of
* the squeezed entry.
*/
long
file_sq(f, t) /* squeeze a file into an archive */
FILE *f; /* file to squeeze */
FILE *t; /* archive to receive file */
{
int c; /* one byte of squeezed data */
long size; /* size after squeezing */
size = wrt_head(t); /* write out the decode tree */
while ((c = gethuff(f)) != EOF) {
putc_pak(c, t);
size++;
}
return size; /* report true size */
}